{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. 获取数据\n",
    "# 2. 处理数据\n",
    "# 2.1 缺失值处理\n",
    "# 2.2 确定特征值，目标值\n",
    "# 2.3 训练数据和测试数据的划分\n",
    "# 3. 特征工程-数据标准化\n",
    "# 4. 机器学习-逻辑回归\n",
    "# 5. 模型评估"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. 获取数据\n",
    "# 1.获取数据\n",
    "names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',\n",
    "                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',\n",
    "                   'Normal Nucleoli', 'Mitoses', 'Class']\n",
    "\n",
    "data = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data\",\n",
    "                  names=names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sample code number</th>\n",
       "      <th>Clump Thickness</th>\n",
       "      <th>Uniformity of Cell Size</th>\n",
       "      <th>Uniformity of Cell Shape</th>\n",
       "      <th>Marginal Adhesion</th>\n",
       "      <th>Single Epithelial Cell Size</th>\n",
       "      <th>Bare Nuclei</th>\n",
       "      <th>Bland Chromatin</th>\n",
       "      <th>Normal Nucleoli</th>\n",
       "      <th>Mitoses</th>\n",
       "      <th>Class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1000025</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1002945</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>10</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1015425</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1016277</td>\n",
       "      <td>6</td>\n",
       "      <td>8</td>\n",
       "      <td>8</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1017023</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Sample code number  Clump Thickness  Uniformity of Cell Size  \\\n",
       "0             1000025                5                        1   \n",
       "1             1002945                5                        4   \n",
       "2             1015425                3                        1   \n",
       "3             1016277                6                        8   \n",
       "4             1017023                4                        1   \n",
       "\n",
       "   Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \\\n",
       "0                         1                  1                            2   \n",
       "1                         4                  5                            7   \n",
       "2                         1                  1                            2   \n",
       "3                         8                  1                            3   \n",
       "4                         1                  3                            2   \n",
       "\n",
       "  Bare Nuclei  Bland Chromatin  Normal Nucleoli  Mitoses  Class  \n",
       "0           1                3                1        1      2  \n",
       "1          10                3                2        1      2  \n",
       "2           2                3                1        1      2  \n",
       "3           4                3                7        1      2  \n",
       "4           1                3                1        1      2  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.to_csv('../../../../data/癌症.csv')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. 处理数据\n",
    "# 2.1 问号处理\n",
    "data.replace(to_replace='?', value=np.nan, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.2 删除缺失值\n",
    "data.dropna(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.3 提取特征值，目标值\n",
    "x = data.iloc[:, 1:10]\n",
    "y = data.Class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2.4 数据分割\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. 特征工程-数据标准化\n",
    "# 3.1 实例化标准化转换器\n",
    "transfer = StandardScaler()\n",
    "\n",
    "# 3.2 特征数据标准化\n",
    "x_train = transfer.fit_transform(x_train)\n",
    "x_test = transfer.fit_transform(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LogisticRegression()"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 4. 机器学习-逻辑回归\n",
    "# 4.1 实例化逻辑回归估计器\n",
    "estimator = LogisticRegression()\n",
    "\n",
    "# 4.2 训练模型\n",
    "estimator.fit(x_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "预测目标值和测试目标值对比:\n",
      " 109    True\n",
      "210    True\n",
      "647    True\n",
      "569    True\n",
      "309    True\n",
      "       ... \n",
      "115    True\n",
      "328    True\n",
      "462    True\n",
      "534    True\n",
      "147    True\n",
      "Name: Class, Length: 137, dtype: bool\n",
      "模型预测准确率:\n",
      " 0.9343065693430657\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "          恶性       0.94      0.95      0.95        83\n",
      "          良性       0.92      0.91      0.92        54\n",
      "\n",
      "    accuracy                           0.93       137\n",
      "   macro avg       0.93      0.93      0.93       137\n",
      "weighted avg       0.93      0.93      0.93       137\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# 5. 模型评估\n",
    "# 5.1 判断预测目标值和测试目标值\n",
    "y_pre = estimator.predict(x_test)\n",
    "print(\"预测目标值和测试目标值对比:\\n\", y_pre == y_test)\n",
    "\n",
    "# 5.2 模型预测准确率\n",
    "print(\"模型预测准确率:\\n\", estimator.score(x_test, y_test))\n",
    "\n",
    "# 5.3 分类评估报告\n",
    "# 如果要显示恶性和良性，则要协商下面对应的labels和target_name参数，否则可读性不高\n",
    "ret = classification_report(y_test, y_pre, labels=(2, 4), target_names=('恶性', '良性'))\n",
    "print(ret)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
